Corpus similarity

The goal of this notebook is to compare the two corpora, the Final notebooks and the Homework notebooks, and to identify measurable differences between them.


In [1]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *

In [2]:
# Load the two corpora: the homework notebooks and the Final notebooks.
HW_DIR = '../hw_corpus'
FINAL_DIR = '../testbed/Final'

# Homework corpus: a flat directory. Keep only .ipynb entries (the same filter
# the Final corpus already applies) so that stray non-notebook entries are never
# handed to NotebookMiner. os.listdir returns names in arbitrary order, so sort
# to make the corpus order reproducible across machines and runs.
hw_notebooks = sorted(
    os.path.join(HW_DIR, fname)
    for fname in os.listdir(HW_DIR)
    if fname.endswith('.ipynb')
)
hw_notebook_objs = [NotebookMiner(path) for path in hw_notebooks]

# Final corpus: every sub-directory of FINAL_DIR is scanned for .ipynb files;
# plain files that sit directly in FINAL_DIR are skipped.
people = sorted(os.listdir(FINAL_DIR))
notebooks = []
for person in people:
    person_dir = os.path.join(FINAL_DIR, person)
    if os.path.isdir(person_dir):
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in sorted(os.listdir(person_dir))
            if filename.endswith('.ipynb')
        )
notebook_objs = [NotebookMiner(path) for path in notebooks]

In [3]:
from nbminer.stats.multiple_summary import MultipleSummary

# Wrap each corpus in a MultipleSummary: homework first, then Final.
hw_summary, final_summary = (
    MultipleSummary(corpus) for corpus in (hw_notebook_objs, notebook_objs)
)

In [4]:
# Report the number of per-notebook summaries held for each corpus.
for corpus_name, corpus_summary in (("Final", final_summary), ("Homework", hw_summary)):
    print("Number of {} notebooks: ".format(corpus_name), len(corpus_summary.summary_vec))


Number of Final notebooks:  177
Number of Homework notebooks:  464

In [5]:
# Report the average cell count per notebook for each corpus.
for corpus_name, corpus_summary in (("Final", final_summary), ("Homework", hw_summary)):
    print("Average number of cells, {}: ".format(corpus_name), corpus_summary.average_number_of_cells())


Average number of cells, Final:  68.92090395480226
Average number of cells, Homework:  36.42672413793103

In [6]:
# Report the average lines of code per notebook for each corpus.
for corpus_name, corpus_summary in (("Final", final_summary), ("Homework", hw_summary)):
    print("Average lines of code, {}: ".format(corpus_name), corpus_summary.average_lines_of_code())


Average lines of code, Final:  271.3502824858757
Average lines of code, Homework:  197.14008620689654

In [ ]:

Combined Clustering


In [7]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity

# Cluster count handed to the k-means encoder for this run.
n_clusters = 100

# Put both corpora in one Features container under separate group labels:
# homework notebooks -> 'group_1', Final notebooks -> 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Stage objects, created in the same order they are passed to Pipeline below.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters=n_clusters)
# Disabled alternative stage, kept for reference:
#   agr = ASTGraphReducer(a, threshold=20, split_call=False)
njs = NotebookJaccardSimilarity()

# `njs` stays a global because the reporting cell queries it after the run.
stages = [gastf, rbn, gi, fe, ke, njs]
pipe = Pipeline(stages)
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x15238d07f0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x15238d0828>
<nbminer.preprocess.get_imports.GetImports object at 0x1a2e7d8b00>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a2e7d8780>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a2e81b0f0>
<nbminer.results.similarity.jaccard_similarity.NotebookJaccardSimilarity object at 0x1a2e81bda0>

In [8]:
import numpy as np

# Similarity values for 'group_1', returned as a within-group collection and an
# outside-group collection (per the labels printed below).
intra, inter = njs.group_average_jaccard_similarity('group_1')

# Print mean and standard deviation for each collection, within-group first.
for scope, values in (('within', intra), ('outside', inter)):
    scores = np.array(values)
    print('Mean {} group: '.format(scope), np.mean(scores))
    print('STD {} group: '.format(scope), np.std(scores))


Mean within group:  0.223739330482
STD within group:  0.0429085293912
Mean outside group:  0.220370504491
STD outside group:  0.0439395959595

In [9]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity

# Cluster count handed to the k-means encoder for this run.
n_clusters = 10

# Fresh Features container with the same two labelled groups:
# homework notebooks -> 'group_1', Final notebooks -> 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Stage objects, created in the same order they are passed to Pipeline below.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters=n_clusters)
# Disabled alternative stage, kept for reference:
#   agr = ASTGraphReducer(a, threshold=20, split_call=False)
njs = NotebookJaccardSimilarity()

# NOTE(review): no cell in the reviewed part of the notebook reports the
# similarity statistics for this 10-cluster run; confirm whether that step
# is missing.
stages = [gastf, rbn, gi, fe, ke, njs]
pipe = Pipeline(stages)
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a43adc9b0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151ab42da0>
<nbminer.preprocess.get_imports.GetImports object at 0x1a406224a8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x10a84fef0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a404b4978>
<nbminer.results.similarity.jaccard_similarity.NotebookJaccardSimilarity object at 0x1a3b7acb70>

Prediction of group


In [10]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier

# Cluster count handed to the k-means encoder for this run.
n_clusters = 10

# Fresh Features container with the two labelled groups:
# homework notebooks -> 'group_1', Final notebooks -> 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Same preprocessing/encoding stages as the similarity runs; the last stage is
# a CorpusIdentifier instead of a NotebookJaccardSimilarity.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters=n_clusters)
# Disabled alternative stage, kept for reference:
#   agr = ASTGraphReducer(a, threshold=20, split_call=False)
ci = CorpusIdentifier()

# `ci` stays a global because the plotting cell calls ci.predict() afterwards.
stages = [gastf, rbn, gi, fe, ke, ci]
pipe = Pipeline(stages)
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a5c495198>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a592c0048>
<nbminer.preprocess.get_imports.GetImports object at 0x1a592c06d8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a5ced0358>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a5ced01d0>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x1a5cf31048>

In [25]:
%matplotlib inline
import matplotlib.pyplot as plt
fpr, tpr, m = ci.predict()
print(m)
plt.plot(fpr, tpr)


0.493810386473
Out[25]:
[<matplotlib.lines.Line2D at 0x1a9641e630>]

In [3]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_simple_features import GetSimpleFeatures
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier

# Fresh Features container with the two labelled groups:
# homework notebooks -> 'group_1', Final notebooks -> 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Two-stage pipeline: GetSimpleFeatures followed by a CorpusIdentifier that is
# configured with feature_name='string'.
gsf = GetSimpleFeatures()
ci = CorpusIdentifier(feature_name='string')

# `ci` stays a global because the plotting cell calls ci.predict() afterwards.
stages = [gsf, ci]
pipe = Pipeline(stages)
a = pipe.transform(a)


<nbminer.preprocess.get_simple_features.GetSimpleFeatures object at 0x1063b9a90>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x150ade57f0>

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
fpr, tpr, m = ci.predict()
print(m)
plt.plot(fpr, tpr)


0.42789296884
Out[10]:
[<matplotlib.lines.Line2D at 0x151c9d1550>]

In [ ]: